started: 01Mar2016
last updated: 19Jan2017

Notes

The filters are applied in the following order:
gq > 20
dp < 500
call_rate > 0.8

gq 20 filter is set arbitrary; however, it is consistent with what is done by others (e.g. see Carson BMC Bioinformatics. 2014 15:125).

A small number of genotypes (<<1%) was covered too high to be true (up to 1-2k coverage). These are obvious mistakes, and they have been removed too. Arbitrarily the threshold for max DP was set to 500 (appr. 10 fold of average coverage).

It was discussed with DC whether to filter cases by call rate per case. There was ~3 cases with low coverage (<20) and low call rates (<50%). We desided to keep such cases because their retained genotypes still passed all filters.

start_section

# Time stamp
Sys.time()
[1] "2017-01-19 20:53:24 GMT"
# Folders
setwd("/scratch/medgen/scripts/wecare_stat_01.17/scripts")
interim_data_folder <- "/scratch/medgen/scripts/wecare_stat_01.17/interim_data"
# Thresholds for genotypes
min.gq <- 20
max.dp <- 500
# Variants call ratethreshold
min.call.rate <- 0.8

load_data

load(paste(interim_data_folder, "r01_read_and_clean_data_jan2017.RData", sep="/"))

check_data

dim(gt.mx)
[1] 343824    710
class(gt.mx)
[1] "matrix"
gt.mx[1:5,1:5]
             HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001      NA       0       0      NA      NA
Var000000002       0       0      NA      NA       0
Var000000003       0       0       0       0       0
Var000000004       0       0       0       0       0
Var000000005       0      NA       0       0       0
dim(gq.mx)
[1] 343824    710
class(gq.mx)
[1] "matrix"
gq.mx[1:5,1:5]
             HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001      NA       9       3      NA      NA
Var000000002      99      99      NA      NA      58
Var000000003      36      99      21      12      21
Var000000004      36      99      18       3      21
Var000000005      36      NA       9      99      33
dim(dp.mx)
[1] 343824    710
class(dp.mx)
[1] "matrix"
dp.mx[1:5,1:5]
             HG00097 HG00099 HG00100 HG00102 HG00106
Var000000001       0       4       1       0      NA
Var000000002      74     169       0       0     130
Var000000003      13      35       7       4       8
Var000000004      13      35       7       1       8
Var000000005      28      NA      10      37      20
dim(covar.df)
[1] 498  34
str(covar.df)
'data.frame':   498 obs. of  34 variables:
 $ Subject_ID      : int  200054 200491 200565 200698 200958 201046 201558 201921 202026 202236 ...
 $ setno           : int  382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
 $ cc              : int  0 0 0 0 0 0 1 1 0 0 ...
 $ chemo           : int  1 1 0 0 1 1 1 1 1 1 ...
 $ hormone         : int  0 1 1 0 1 0 0 1 1 0 ...
 $ chemo_hormone   : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
 $ chemo_self_mra  : int  1 1 0 0 1 1 1 1 1 1 ...
 $ hormone_self_mra: int  0 1 1 0 1 0 0 1 1 0 ...
 $ treatment       : int  1 1 1 0 1 1 1 1 1 1 ...
 $ ID              : int  2 6 7 9 11 12 16 22 24 26 ...
 $ labid           : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ status          : int  0 0 0 0 0 0 1 1 0 0 ...
 $ status2         : int  0 0 0 0 0 0 1 1 0 0 ...
 $ offset          : num  6.41 5.82 5.61 4.03 4.77 ...
 $ sub_dx_age      : int  46 50 46 44 43 39 45 40 43 36 ...
 $ XRTBreast       : int  0 0 0 1 0 1 0 0 1 0 ...
 $ Eigen_1         : num  -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
 $ Eigen_2         : num  0.0055 0.00414 0.00302 0.00301 0.0023 ...
 $ Eigen_3         : num  -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
 $ Eigen_4         : num  0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
 $ Eigen_5         : num  0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
 $ dose            : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
 $ dsmiss          : int  0 0 0 0 1 0 0 0 0 0 ...
 $ good_location   : int  1 1 0 1 1 1 0 1 0 0 ...
 $ Deleterious     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ registry        : Factor w/ 5 levels "IA","IR","LA",..: 5 2 2 4 4 3 2 1 1 5 ...
 $ race            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ age_stratum1    : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
 $ dxyr_stratum    : int  2 2 3 1 1 2 1 2 3 2 ...
 $ CMF_Only        : int  1 0 0 0 0 1 0 0 1 1 ...
 $ family_history  : int  0 0 0 0 0 0 1 0 0 0 ...
 $ sub_dx_age_cat  : int  0 0 0 1 1 1 0 1 1 1 ...
 $ CMF             : Factor w/ 3 levels "CMF","Oth","no": 1 2 3 3 2 1 2 2 1 1 ...
 $ XRTBrCHAR       : int  0 0 0 1 0 1 0 0 1 0 ...
covar.df[1:5,1:5]
dim(samples.df)
[1] 512   4
str(samples.df)
'data.frame':   512 obs. of  4 variables:
 $ wes_id   : Factor w/ 512 levels "P1_A01","P1_A02",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ gwas_id  : Factor w/ 510 levels "id200054","id200491",..: 14 405 315 264 67 121 326 251 281 141 ...
 $ merged_id: Factor w/ 512 levels "P1_A01_id203568",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ filter   : Factor w/ 5 levels "duplicate","low_concordance",..: 5 5 5 5 5 5 5 5 5 5 ...
samples.df[1:5,]
dim(demographics.df)
[1] 505  91
str(demographics.df)
'data.frame':   505 obs. of  91 variables:
 $ Subject_ID            : Factor w/ 503 levels "200054","200491",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ ID.x                  : num  2 6 7 9 11 12 NA NA 24 26 ...
 $ labid.x               : Factor w/ 498 levels "15582015","19212019",..: 6 7 8 9 10 11 NA NA 12 13 ...
 $ Eigen_1.x             : num  -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
 $ Eigen_2.x             : num  0.0055 0.00414 0.00302 0.00301 0.0023 ...
 $ Eigen_3.x             : num  -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
 $ Eigen_4.x             : num  0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
 $ Eigen_5.x             : num  0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
 $ Phase                 : num  1 1 1 1 1 1 NA NA 1 1 ...
 $ setno.x               : num  382125 204356 360683 226881 357431 ...
 $ cc.x                  : int  0 0 0 0 0 0 NA NA 0 0 ...
 $ rstime                : num  7.42 9.75 6.09 6.25 9.34 7 NA NA 6.09 8.66 ...
 $ registry.x            : Factor w/ 7 levels "","IA","IR","LA",..: 6 3 3 5 5 4 NA NA 2 6 ...
 $ race.x                : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ offset.x              : num  6.41 5.82 5.61 4.03 4.77 ...
 $ DOB                   : num  -5049 -7459 -3916 -5890 -6407 ...
 $ sub_dx_age.x          : num  46 50 46 44 43 39 NA NA 43 36 ...
 $ refage                : num  53 59 51 50 52 45 NA NA 48 44 ...
 $ BMI_age18             : num  20.2 19.7 23.3 19.5 25.8 ...
 $ BMI_dx                : num  22.8 20.9 23.3 32.6 25.8 ...
 $ BMI_ref               : num  25.7 22 25.8 32.6 28.1 ...
 $ hormone_self_mra.x    : num  0 1 1 0 1 0 NA NA 1 0 ...
 $ chemo_self_mra.x      : num  1 1 0 0 1 1 NA NA 1 1 ...
 $ treatment.x           : num  1 1 1 0 1 1 NA NA 1 1 ...
 $ family_history.x      : Factor w/ 3 levels "1+","none","othe": 2 2 2 2 2 2 NA NA 2 2 ...
 $ rh_age_menarche       : num  13 14 13 13 12 9 NA NA 12 11 ...
 $ age_menopause_1yrbf_fd: num  -1 48 -1 -1 -1 -1 NA NA -1 -1 ...
 $ age_1fftp_fd          : num  20 22 0 29 28 0 NA NA 27 24 ...
 $ Num_ftp_fd            : num  1 2 -1 2 2 -1 NA NA 3 2 ...
 $ Histology             : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
 $ Histology1            : Factor w/ 4 levels "lobular","medullar",..: 3 3 3 2 3 3 NA NA 3 3 ...
 $ Hist_lob_other        : Factor w/ 4 levels "lobular","other",..: 2 2 2 2 2 2 NA NA 2 2 ...
 $ stage_fd              : num  2 1 1 1 2 2 NA NA 2 2 ...
 $ er_fd                 : num  1 1 1 4 1 2 NA NA 2 2 ...
 $ pr_fd                 : num  1 1 1 2 1 2 NA NA 1 0 ...
 $ histo1_cat            : Factor w/ 9 levels "Tubular/mucinous",..: 9 4 4 6 4 4 NA NA 4 4 ...
 $ er1_cat               : Factor w/ 5 levels "negative","own unkn",..: 3 3 3 5 3 1 NA NA 1 1 ...
 $ pr1_cat               : Factor w/ 7 levels "negative","own 0 Ot",..: 3 3 3 1 3 1 NA NA 3 7 ...
 $ status.x              : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ status2.x             : Factor w/ 4 levels "","0","1","h": 2 2 2 2 2 2 NA NA 2 2 ...
 $ sub_race              : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ er1_num               : num  1 1 1 NA 1 0 NA NA 0 0 ...
 $ er1                   : int  1 1 1 NA 1 0 NA NA 0 0 ...
 $ horm_tmx              : num  0 1 1 0 2 0 NA NA 1 0 ...
 $ XRTBreast.x           : num  0 0 0 1 0 1 NA NA 1 0 ...
 $ dose_caseloc          : num  0 0 0 0.96 NA 0.88 NA NA 0.76 0 ...
 $ good_location.x       : num  1 1 0 1 1 1 NA NA 0 0 ...
 $ avedose               : num  0 0 0 1.65 NA 1.45 NA NA 0.91 0 ...
 $ tmx                   : num  0 1 1 0 0 0 NA NA 1 0 ...
 $ num_preg              : num  1 1 0 1 1 0 NA NA 1 1 ...
 $ fam_hist              : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ age_menarche          : int  1 1 1 1 0 0 NA NA 0 0 ...
 $ lobular               : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ age_menopause         : int  0 2 0 0 0 0 NA NA 0 0 ...
 $ conf_miss             : num  0 0 0 0 0 0 NA NA 0 0 ...
 $ dose_num              : num  0 0 0 1 NA 1 NA NA 1 0 ...
 $ cmf                   : Factor w/ 6 levels "","\024\x9c@",..: 3 4 5 5 3 3 NA NA 3 3 ...
 $ cmf_012               : num  1 2 0 0 1 1 NA NA 1 1 ...
 $ setno.y               : int  382125 204356 360683 226881 357431 374980 201558 201921 213991 385058 ...
 $ cc.y                  : int  0 0 0 0 0 0 1 1 0 0 ...
 $ chemo                 : int  1 1 0 0 1 1 1 1 1 1 ...
 $ hormone               : int  0 1 1 0 1 0 0 1 1 0 ...
 $ chemo_hormone         : Factor w/ 5 levels "","both","chem",..: 3 2 4 5 2 3 3 2 2 3 ...
 $ chemo_self_mra.y      : int  1 1 0 0 1 1 1 1 1 1 ...
 $ hormone_self_mra.y    : int  0 1 1 0 1 0 0 1 1 0 ...
 $ treatment.y           : int  1 1 1 0 1 1 1 1 1 1 ...
 $ ID.y                  : int  2 6 7 9 11 12 16 22 24 26 ...
 $ labid.y               : Factor w/ 498 levels "id200054","id200491",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ status.y              : int  0 0 0 0 0 0 1 1 0 0 ...
 $ status2.y             : int  0 0 0 0 0 0 1 1 0 0 ...
 $ offset.y              : num  6.41 5.82 5.61 4.03 4.77 ...
 $ sub_dx_age.y          : int  46 50 46 44 43 39 45 40 43 36 ...
 $ XRTBreast.y           : int  0 0 0 1 0 1 0 0 1 0 ...
 $ Eigen_1.y             : num  -0.0079 -0.01062 -0.00539 -0.00906 -0.01094 ...
 $ Eigen_2.y             : num  0.0055 0.00414 0.00302 0.00301 0.0023 ...
 $ Eigen_3.y             : num  -0.02171 0.01254 0.00368 -0.00655 -0.01389 ...
 $ Eigen_4.y             : num  0.00356 -0.00787 -0.01496 -0.01256 -0.00501 ...
 $ Eigen_5.y             : num  0.00135 -0.0291 0.00391 0.01357 -0.00526 ...
 $ dose                  : Factor w/ 3 levels "ge 1Gy","ls 1Gy",..: 3 3 3 2 3 2 3 3 2 3 ...
 $ dsmiss                : int  0 0 0 0 1 0 0 0 0 0 ...
 $ good_location.y       : int  1 1 0 1 1 1 0 1 0 0 ...
 $ Deleterious           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ registry.y            : Factor w/ 5 levels "IA","IR","LA",..: 5 2 2 4 4 3 2 1 1 5 ...
 $ race.y                : int  0 0 0 0 0 0 0 0 0 0 ...
 $ age_stratum1          : Factor w/ 5 levels "20to34","35to39",..: 4 4 4 3 3 2 3 3 3 2 ...
 $ dxyr_stratum          : int  2 2 3 1 1 2 1 2 3 2 ...
 $ CMF_Only              : int  1 0 0 0 0 1 0 0 1 1 ...
 $ family_history.y      : int  0 0 0 0 0 0 1 0 0 0 ...
 $ sub_dx_age_cat        : int  0 0 0 1 1 1 0 1 1 1 ...
 $ CMF                   : Factor w/ 3 levels "CMF","Oth","no": 1 2 3 3 2 1 2 2 1 1 ...
 $ XRTBrCHAR             : int  0 0 0 1 0 1 0 0 1 0 ...
demographics.df[1:5,1:5]
dim(phenotypes_update.df)
[1] 13 22
str(phenotypes_update.df)
'data.frame':   13 obs. of  22 variables:
 $ gwas_id       : chr  "id319270" "id298378" "id271981" "id301570" ...
 $ wes_id        : chr  "P1_C02" "P1_C04" "P1_C06" "P1_C11" ...
 $ cc            : int  0 0 0 0 1 0 1 1 0 0 ...
 $ setno         : int  227587 243637 332699 247333 204830 204830 310424 247333 398607 201558 ...
 $ family_history: int  1 1 1 0 1 0 1 1 0 0 ...
 $ age_dx        : int  30 30 31 30 24 23 31 33 49 40 ...
 $ age_ref       : int  39 33 32 34 28 27 32 37 54 47 ...
 $ rstime        : num  9.08 3.17 2.09 4.5 3.83 3.83 1.33 4.5 5.16 7.25 ...
 $ Eigen_1       : num  -0.01369 -0.00753 -0.00687 -0.00363 -0.01057 ...
 $ Eigen_2       : num  0.00615 0.00853 0.00138 0.00428 0.00317 ...
 $ Eigen_3       : num  -0.00474 -0.01256 0.00329 0.03749 -0.00716 ...
 $ Eigen_4       : num  -0.00596 0.00849 -0.01233 -0.02952 -0.00024 ...
 $ Eigen_5       : num  -0.01219 0.01074 -0.0138 -0.0202 -0.00147 ...
 $ stage         : int  1 1 2 2 2 1 2 1 1 1 ...
 $ er1           : num  0 1 0 1 0 1 1 NA 1 NA ...
 $ pr1           : num  0 1 NA 1 0 1 0 NA 1 NA ...
 $ hist_cat      : chr  "other carcinoma" "ductal" "ductal" "ductal" ...
 $ hormone       : num  0 0 0 0 0 0 0 0 NA 0 ...
 $ chemo_cat     : chr  "CMF" "no" "Oth" "no" ...
 $ br_xray       : int  1 0 0 0 0 1 0 1 0 1 ...
 $ br_xray_dose  : num  1.1 0 0 0 0 1.2 0 1.9 0 0.86 ...
 $ num_preg      : int  2 0 1 1 1 0 0 1 1 0 ...
phenotypes_update.df[1:5,1:5]
dim(BRCA1_BRCA2_PALB2_cases.df)
[1] 11 12
str(BRCA1_BRCA2_PALB2_cases.df)
'data.frame':   11 obs. of  12 variables:
 $ Cases_wes_id : Factor w/ 11 levels "P1_A09","P1_C02",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ Cases_gwas_id: Factor w/ 11 levels "id204830","id247333",..: 8 9 3 1 6 2 4 11 5 7 ...
 $ SYMBOL       : Factor w/ 3 levels "BRCA1","BRCA2",..: 1 3 1 1 2 1 2 2 3 3 ...
 $ TYPE         : Factor w/ 2 levels "INDEL","SNP": 2 1 2 2 2 1 1 1 2 2 ...
 $ CHROM        : int  17 16 17 17 13 17 13 13 16 16 ...
 $ POS.GRCh37   : int  41256985 23641422 41243800 41258504 32936732 41276044 32914437 32914437 23625324 23634452 ...
 $ REF          : Factor w/ 6 levels "A","C","G","GT",..: 5 6 2 1 3 1 4 4 2 2 ...
 $ ALT          : Factor w/ 5 levels "A","ACT","C",..: 3 5 1 3 3 2 4 4 4 4 ...
 $ rs_id        : Factor w/ 9 levels "rs28897672","rs28897686",..: 7 5 2 1 8 6 9 9 4 3 ...
 $ Consequence  : Factor w/ 6 levels "frameshift_variant",..: 2 1 6 3 3 1 1 1 5 4 ...
 $ wes_var_id   : Factor w/ 9 levels "Var000223294",..: 7 5 6 8 2 9 1 1 3 4 ...
 $ Note         : Factor w/ 2 levels "","Added for QC": NA NA 2 2 NA 2 NA NA NA NA ...
BRCA1_BRCA2_PALB2_cases.df[1:5,1:5]
dim(vv.df)
[1] 343824     35
str(vv.df)
'data.frame':   343824 obs. of  35 variables:
 $ SplitVarID         : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ TYPE               : Factor w/ 2 levels "INDEL","SNP": 1 2 2 2 2 2 2 2 2 2 ...
 $ ID                 : Factor w/ 253381 levels "rs10000804","rs10000924",..: NA 234676 201998 253336 54527 NA NA 162750 NA NA ...
 $ CHROM              : Factor w/ 25 levels "1","10","11",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ POS                : int  664486 762330 865628 865694 871159 871171 871173 871215 871230 871271 ...
 $ REF                : Factor w/ 2142 levels "A","AAAAAAAAAAAAAAAAAAAAAAAG",..: 1653 1044 1044 539 1044 1 539 539 539 539 ...
 $ ALT                : Factor w/ 1297 levels "A","AAAAAAAAAAAAACT",..: 989 989 1 989 1 664 1 664 989 989 ...
 $ QUAL               : num  35191 7081 6420 991 649 ...
 $ DP                 : int  49355 26720 14628 8357 11648 12903 13516 16282 15415 10748 ...
 $ AS_VQSLOD          : num  1.57 7.04 16.2 18.18 16.05 ...
 $ FILTER             : Factor w/ 1 level "PASS": 1 1 1 1 1 1 1 1 1 1 ...
 $ AC                 : int  53 8 7 4 1 1 1 5 1 1 ...
 $ AF                 : num  0.044 0.00622 0.00519 0.00318 0.00082 ...
 $ AN                 : int  1206 1286 1348 1258 1220 1250 1258 1312 1310 1294 ...
 $ NEGATIVE_TRAIN_SITE: logi  NA NA NA NA NA NA ...
 $ POSITIVE_TRAIN_SITE: logi  NA NA NA TRUE NA NA ...
 $ SYMBOL             : Factor w/ 19874 levels "A1BG","A1CF",..: 14360 9127 14922 14922 14922 14922 14922 14922 14922 14922 ...
 $ Allele             : Factor w/ 1048 levels "-","A","AA","AAA",..: 1 790 2 790 2 533 2 533 790 790 ...
 $ Existing_variation : Factor w/ 307403 levels "","1KG_2_227731951",..: NA 255726 203059 307355 55938 218187 NA 163633 277286 NA ...
 $ Consequence        : Factor w/ 80 levels "3_prime_UTR_variant",..: 80 32 28 28 28 28 78 78 78 28 ...
 $ IMPACT             : Factor w/ 4 levels "HIGH","LOW","MODERATE",..: 4 4 3 3 3 3 2 2 2 3 ...
 $ CLIN_SIG           : Factor w/ 93 levels "association",..: NA NA NA NA NA NA NA NA NA NA ...
 $ cDNA_position      : Factor w/ 16588 levels "0-1","1","1-18",..: NA 11562 6108 7438 8899 9105 9146 9809 10047 10653 ...
 $ CDS_position       : Factor w/ 15333 levels "1","1-18","1-2",..: NA NA 3598 5147 6675 6879 6912 7603 7853 8506 ...
 $ Codons             : Factor w/ 3014 levels "-/A","-/AA","-/AAAA",..: NA NA 698 472 690 319 1167 1753 1848 1532 ...
 $ Protein_position   : Factor w/ 7359 levels "1","1-2","1-6",..: NA NA 5937 6741 125 216 216 514 620 910 ...
 $ Amino_acids        : Factor w/ 1851 levels "*","*/*EX","*/C",..: NA NA 589 695 589 1586 1583 1100 1329 1107 ...
 $ DISTANCE           : int  960 NA NA NA NA NA NA NA NA NA ...
 $ STRAND             : int  -1 -1 1 1 1 1 1 1 1 1 ...
 $ SYMBOL_SOURCE      : Factor w/ 6 levels "Clone_based_ensembl_gene",..: 2 3 3 3 3 3 3 3 3 3 ...
 $ SIFT_call          : Factor w/ 4 levels "deleterious",..: NA NA 2 1 3 1 NA NA NA 4 ...
 $ SIFT_score         : num  NA NA 0.01 0.04 1 0.05 NA NA NA 0.09 ...
 $ PolyPhen_call      : Factor w/ 4 levels "benign","possibly_damaging",..: NA NA 1 2 1 1 NA NA NA 1 ...
 $ PolyPhen_score     : num  NA NA 0.099 0.493 0.002 0.018 NA NA NA 0.045 ...
 $ Multiallelic       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
vv.df[1:5,1:5]
dim(kgen.df)
[1] 343824      9
str(kgen.df)
'data.frame':   343824 obs. of  9 variables:
 $ SplitVarID : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ kgen.AC    : int  NA NA 14 263 2 1 NA NA NA NA ...
 $ kgen.AN    : int  NA NA 5008 5008 5008 5008 NA NA NA NA ...
 $ kgen.AF    : num  NA NA 0.002796 0.052516 0.000399 ...
 $ kgen.AFR_AF: num  NA NA 0 0.0227 0 0 NA NA NA NA ...
 $ kgen.AMR_AF: num  NA NA 0.0072 0.0965 0 0 NA NA NA NA ...
 $ kgen.EAS_AF: num  NA NA 0 0.139 0 ...
 $ kgen.EUR_AF: num  NA NA 0.005 0.002 0.002 0.001 NA NA NA NA ...
 $ kgen.SAS_AF: num  NA NA 0.0041 0.0245 0 0 NA NA NA NA ...
kgen.df[1:5,1:5]
dim(exac.df)
[1] 343824     48
str(exac.df)
'data.frame':   343824 obs. of  48 variables:
 $ SplitVarID             : Factor w/ 343824 levels "Var000000001",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ exac_non_TCGA.AF       : num  NA 0.012 0.00278 0.025 0.00231 ...
 $ exac_non_TCGA.AC       : int  NA 175 295 2670 245 NA NA NA NA NA ...
 $ exac_non_TCGA.AN       : int  NA 14012 106020 106038 106210 NA NA NA NA NA ...
 $ exac_non_TCGA.AC_FEMALE: int  NA 59 94 1232 97 NA NA 1553 NA NA ...
 $ exac_non_TCGA.AN_FEMALE: int  NA 3452 18598 24670 45846 NA NA 45840 NA NA ...
 $ exac_non_TCGA.AC_MALE  : int  NA 91 170 1159 148 NA NA 1483 NA NA ...
 $ exac_non_TCGA.AN_MALE  : int  NA 8250 26380 33440 60312 NA NA 60304 NA NA ...
 $ exac_non_TCGA.AC_Adj   : int  NA 150 264 2391 245 NA NA 3036 NA NA ...
 $ exac_non_TCGA.AN_Adj   : int  NA 11702 44978 58110 106158 NA NA 106144 NA NA ...
 $ exac_non_TCGA.AC_Hom   : int  NA 0 1 102 2 NA NA 136 NA NA ...
 $ exac_non_TCGA.AC_Het   : int  NA 150 262 2187 241 NA NA 2760 NA NA ...
 $ exac_non_TCGA.AC_Hemi  : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_AFR   : int  NA 85 6 172 0 NA NA 263 NA NA ...
 $ exac_non_TCGA.AN_AFR   : int  NA 422 4194 5062 9054 NA NA 9040 NA NA ...
 $ exac_non_TCGA.Hom_AFR  : int  NA 0 0 2 0 NA NA 6 NA NA ...
 $ exac_non_TCGA.Het_AFR  : int  NA 85 6 168 0 NA NA 251 NA NA ...
 $ exac_non_TCGA.Hemi_AFR : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_AMR   : int  NA 2 20 979 0 NA NA 1180 NA NA ...
 $ exac_non_TCGA.AN_AMR   : int  NA 224 3570 6030 11216 NA NA 11210 NA NA ...
 $ exac_non_TCGA.Hom_AMR  : int  NA 0 0 46 0 NA NA 67 NA NA ...
 $ exac_non_TCGA.Het_AMR  : int  NA 2 20 887 0 NA NA 1046 NA NA ...
 $ exac_non_TCGA.Hemi_AMR : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_EAS   : int  NA 0 0 792 0 NA NA 889 NA NA ...
 $ exac_non_TCGA.AN_EAS   : int  NA 314 3432 4930 7866 NA NA 7854 NA NA ...
 $ exac_non_TCGA.Hom_EAS  : int  NA 0 0 47 0 NA NA 52 NA NA ...
 $ exac_non_TCGA.Het_EAS  : int  NA 0 0 698 0 NA NA 785 NA NA ...
 $ exac_non_TCGA.Hemi_EAS : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_FIN   : int  NA 0 9 4 149 NA NA 39 NA NA ...
 $ exac_non_TCGA.AN_FIN   : int  NA 30 1376 1960 6614 NA NA 6614 NA NA ...
 $ exac_non_TCGA.Hom_FIN  : int  NA 0 0 0 2 NA NA 0 NA NA ...
 $ exac_non_TCGA.Het_FIN  : int  NA 0 9 4 145 NA NA 39 NA NA ...
 $ exac_non_TCGA.Hemi_FIN : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_NFE   : int  NA 50 193 61 92 NA NA 172 NA NA ...
 $ exac_non_TCGA.AN_NFE   : int  NA 2944 22156 28886 54312 NA NA 54328 NA NA ...
 $ exac_non_TCGA.Hom_NFE  : int  NA 0 1 1 0 NA NA 2 NA NA ...
 $ exac_non_TCGA.Het_NFE  : int  NA 50 191 59 92 NA NA 168 NA NA ...
 $ exac_non_TCGA.Hemi_NFE : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_SAS   : int  NA 10 33 373 1 NA NA 478 NA NA ...
 $ exac_non_TCGA.AN_SAS   : int  NA 7650 9972 10884 16402 NA NA 16404 NA NA ...
 $ exac_non_TCGA.Hom_SAS  : int  NA 0 0 6 0 NA NA 9 NA NA ...
 $ exac_non_TCGA.Het_SAS  : int  NA 10 33 361 1 NA NA 456 NA NA ...
 $ exac_non_TCGA.Hemi_SAS : int  NA NA NA NA NA NA NA NA NA NA ...
 $ exac_non_TCGA.AC_OTH   : int  NA 3 3 10 3 NA NA 15 NA NA ...
 $ exac_non_TCGA.AN_OTH   : int  NA 118 278 358 694 NA NA 694 NA NA ...
 $ exac_non_TCGA.Hom_OTH  : int  NA 0 0 0 0 NA NA 0 NA NA ...
 $ exac_non_TCGA.Het_OTH  : int  NA 3 3 10 3 NA NA 15 NA NA ...
 $ exac_non_TCGA.Hemi_OTH : int  NA NA NA NA NA NA NA NA NA NA ...
exac.df[1:5,1:5]
# Check consistence of rownames
sum(rownames(gt.mx) != rownames(gq.mx))
[1] 0
sum(rownames(gt.mx) != rownames(dp.mx))
[1] 0
sum(rownames(gt.mx) != rownames(vv.df))
[1] 0
sum(rownames(gt.mx) != rownames(kgen.df))
[1] 0
sum(rownames(gt.mx) != rownames(exac.df))
[1] 0
# Check consistence of colnames
sum(colnames(gt.mx) != colnames(gq.mx))
[1] 0
sum(colnames(gt.mx) != colnames(dp.mx))
[1] 0

explore_data_before_filtering

Genotypes NA rates Histogram of call rates per variant
Histograms of gq and dp in non-NA genotypes

# Fraction of NA genotypes before filtering
sum(is.na(gt.mx))/(dim(gt.mx)[1]*dim(gt.mx)[2]) # ~4%
[1] 0.0406923
# Call rates per variant before filtering
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, breaks=50, xlab=NULL, main="Call rates per variant before genotypes filtering")

# Histogram of gq  before filtering (when gt is not NA !)
hist(gq.mx[!is.na(gt.mx)], breaks=50, main="Histogram of gq in non-NA genotypes (before filtering)", xlab=NULL)

# Histogram of dp before filtering (when gt is not NA !)
hist(dp.mx[!is.na(gt.mx)], breaks=50, main="Histogram of dp in non-NA genotypes (before filtering)", xlab=NULL)

hist(dp.mx[!is.na(gt.mx)], breaks=2500, xlim=c(0,100), main="Histogram of dp in non-NA genotypes (before filtering, 0:100)", xlab=NULL)

rm(x,y)

filter_out_low_gq

Put NA to genotypes where gq < 20 : removes ~8% of non-NA genotypes

# num of genotypes to be removed
sum(gq.mx < min.gq, na.rm=TRUE)
[1] 19243468
# Fraction of genotypes to be removed
sum(gq.mx < min.gq, na.rm=TRUE)/sum(!is.na(gq.mx)) # ~8%
[1] 0.08235379
# Apply filter (to gt only !)
NA -> gt.mx[ gq.mx < min.gq ]
# Clean up
rm(min.gq)

explore_data_after_gq_filtering

dim() genotypes NA rates Histogram of call rates per variant Histograms of gq and dp in non-NA genotypes

dim(gt.mx)
[1] 343824    710
# Fraction of NA genotypes after filtering
sum(is.na(gt.mx))/(dim(gt.mx)[1]*dim(gt.mx)[2]) # ~12%
[1] 0.1195218
# Call rates per variant after gq filtering
x <- ncol(gt.mx)
y <- apply(gt.mx, 1, function(z){1-sum(is.na(z))/x})
hist(y, xlab=NULL, main="Histogram of call rates per variant after gt filtering")

# Histogram of gq  after gq filtering (when gt is not NA !)
hist(gq.mx[!is.na(gt.mx)], xlim=c(0,100), breaks=50, main="Histogram of gq in non NA genotypes (after gq filtering)", xlab=NULL)
View(vv.df)

# Histogram of dp after gt filtering (when gt is not NA !)
hist(dp.mx[!is.na(gt.mx)], breaks=50, main="Histogram of dp in non-NA genotypes (after gt filtering)", xlab=NULL)

hist(dp.mx[!is.na(gt.mx)], xlim=c(0,100), breaks=2500, main="Histogram of dp in non-NA genotypes (after gt filtering, 0:100)", xlab=NULL)

# Clean up
rm(x, y)

filter_out_high_dp

put NA to genotypes where dp > 500 : removes <<1% of non-NA genotypes

# num of genotypes to be removed
sum(dp.mx > max.dp, na.rm=TRUE)
[1] 45313
# Fraction of genotypes to be removed (appr)
sum(dp.mx > max.dp, na.rm=TRUE)/sum(!is.na(gq.mx)) # <<1%
[1] 0.0001939202
# Apply filter (to gt only !)
NA -> gt.mx[ dp.mx > max.dp ]
# Clean up
rm(max.dp)

explore_data_after_gq_dp_filtering

dim() Genotypes NA rates Histogram of call rates per variant Histograms of gq and dp

dim(gt.mx)
[1] 343824    710
# Fraction of NA genotypes after filtering
sum(is.na(gt.mx))/(dim(gt.mx)[1]*dim(gt.mx)[2]) # ~12%
[1] 0.1196978
# Call rates per variant after gq filtering
x <- ncol(gt.mx)
y <- apply(gt.mx, 1, function(z){1-sum(is.na(z))/x})
hist(y, xlab=NULL, main="Call rates per variant after gt+dp filtering")

# Histogram of gq  after gq+dp filtering (when gt is not NA !)
hist(gq.mx[!is.na(gt.mx)], xlim=c(0,100), breaks=50, main="Histogram of gq in non-NA genotypes (after gq+dp filtering)", xlab=NULL)

# Histogram of dp after gt filtering (when gt is not NA !)
hist(dp.mx[!is.na(gt.mx)], breaks=50, main="Histogram of dp after gt+dp filtering", xlab=NULL)

hist(dp.mx[!is.na(gt.mx)], xlim=c(0,100), breaks=500, main="Histogram of dp in non-NA genotypes (after gt+dp filtering, 0:100)", xlab=NULL)

# Clean up
rm(x, y)

filter_variants_by_final_call_rate

Remove variants with call rate < 80% after the gemnotypes filtering: removes ~18% of variants (343,824 -> 283,651)

# Estimate the proportion of variants to be removed
x <- ncol(gt.mx)
y <- apply(gt.mx, 1, function(z){1-sum(is.na(z))/x})
y[1:7]
Var000000001 Var000000002 Var000000003 Var000000004 Var000000005 Var000000006 Var000000007 
   0.7211268    0.8112676    0.7887324    0.5647887    0.6760563    0.7098592    0.7450704 
var.retained <- y >= min.call.rate
sum(var.retained) # 283.651
[1] 283651
1 - sum(var.retained)/nrow(gt.mx) # ~18%
[1] 0.1750111
# Remove variants with loaw call rates
gt.mx <- gt.mx[ var.retained, ]
dp.mx <- dp.mx[ var.retained, ]
gq.mx <- gq.mx[ var.retained, ]
vv.df <- vv.df[ var.retained, ]
kgen.df <- kgen.df[ var.retained, ]
exac.df <- exac.df[ var.retained, ]
# Clean-up
rm(min.call.rate, var.retained, x, y)

explore_data_after_gq_dp_cr_filtering

Histograms of gq and dp
NA rates in gq table
Histogram of call rates per variant

dim(gt.mx)
[1] 283651    710
# Fraction of NA genotypes after filtering
sum(is.na(gt.mx))/(dim(gt.mx)[1]*dim(gt.mx)[2]) # ~8%
[1] 0.07691938
# Call rates per variant after filtering
x <- ncol(gt.mx)
y <- apply(gt.mx,1,function(z){1-sum(is.na(z))/x})
hist(y, xlim=c(0,1), breaks=10, xlab=NULL, main="Call rates per variant after gq+dp+cr genotypes filtering")

# Histogram of gq  after filtering (when gt is not NA !)
hist(gq.mx[!is.na(gt.mx)], xlim=c(0,100), breaks=50, main="Histogram of gq in non-NA genotypes (after gq+dp+cr filtering)", xlab=NULL)

# Histogram of dp  after filtering (when gt is not NA !)
hist(dp.mx[!is.na(gt.mx)], breaks=50, main="Histogram of dp in non-NA genotypes (after gq+dp+cr filtering)", xlab=NULL)

hist(dp.mx[!is.na(gt.mx)], breaks=500, xlim=c(0,100), main="Histogram of dp in non-NA genotypes (after gq+dp+cr filtering, 0:100)", xlab=NULL)

# Clean-up
rm(x, y)

remove_unnecessary_data

mean(gq.mx, na.rm=TRUE)
[1] 84.51277
mean(dp.mx, na.rm=TRUE)
[1] 37.31567
rm(gq.mx, dp.mx)

data_summary

dim(gt.mx)
[1] 283651    710
class(gt.mx)
[1] "matrix"
gt.mx[1:5,1:5]
             HG00097 HG00099 HG00100 HG00102 HG00106
Var000000002       0       0      NA      NA       0
Var000000008       0      NA       0       0       0
Var000000009       0       0       0       0       0
Var000000012       0       0      NA       0       0
Var000000013       0       0      NA       0       0
dim(gq.mx)
Error: object 'gq.mx' not found

save_data

final_section

rr

ls() sessionInfo() Sys.time()

LS0tCnRpdGxlOiAiZmlsdGVyX2dlbm90eXBlc193ZWNhcmVfamFuMjAxNyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKc3RhcnRlZDogMDFNYXIyMDE2ICAKbGFzdCB1cGRhdGVkOiAxOUphbjIwMTcKCiMgTm90ZXMgCgpUaGUgZmlsdGVycyBhcmUgYXBwbGllZCBpbiB0aGUgZm9sbG93aW5nIG9yZGVyOiAgCmdxID4gMjAgIApkcCA8IDUwMCAgCmNhbGxfcmF0ZSA+IDAuOAoKZ3EgMjAgZmlsdGVyIGlzIHNldCBhcmJpdHJhcnk7IGhvd2V2ZXIsIGl0IGlzIGNvbnNpc3RlbnQgd2l0aCB3aGF0IGlzIGRvbmUgYnkgb3RoZXJzIAooZS5nLiBzZWUgQ2Fyc29uIEJNQyBCaW9pbmZvcm1hdGljcy4gMjAxNCAxNToxMjUpLiAKCkEgc21hbGwgbnVtYmVyIG9mIGdlbm90eXBlcyAoPDwxJSkgd2FzIGNvdmVyZWQgdG9vIGhpZ2ggdG8gYmUgdHJ1ZSAodXAgdG8gMS0yayBjb3ZlcmFnZSkuIApUaGVzZSBhcmUgb2J2aW91cyBtaXN0YWtlcywgYW5kIHRoZXkgaGF2ZSBiZWVuIHJlbW92ZWQgdG9vLiAgQXJiaXRyYXJpbHkgdGhlIHRocmVzaG9sZCBmb3IgCm1heCBEUCB3YXMgc2V0IHRvIDUwMCAoYXBwci4gMTAgZm9sZCBvZiBhdmVyYWdlIGNvdmVyYWdlKS4KCkl0IHdhcyBkaXNjdXNzZWQgd2l0aCBEQyB3aGV0aGVyIHRvIGZpbHRlciBjYXNlcyBieSBjYWxsIHJhdGUgcGVyIGNhc2UuIApUaGVyZSB3YXMgfjMgY2FzZXMgd2l0aCBsb3cgY292ZXJhZ2UgKDwyMCkgYW5kIGxvdyBjYWxsIHJhdGVzICg8NTAlKS4gCldlIGRlc2lkZWQgdG8ga2VlcCBzdWNoIGNhc2VzIGJlY2F1c2UgdGhlaXIgcmV0YWluZWQgZ2Vub3R5cGVzIHN0aWxsIHBhc3NlZCBhbGwgZmlsdGVycy4gCgojIHN0YXJ0X3NlY3Rpb24KCmBgYHtyIHN0YXJ0X3NlY3Rpb259CgojIFRpbWUgc3RhbXAKU3lzLnRpbWUoKQoKIyBGb2xkZXJzCnNldHdkKCIvc2NyYXRjaC9tZWRnZW4vc2NyaXB0cy93ZWNhcmVfc3RhdF8wMS4xNy9zY3JpcHRzIikKaW50ZXJpbV9kYXRhX2ZvbGRlciA8LSAiL3NjcmF0Y2gvbWVkZ2VuL3NjcmlwdHMvd2VjYXJlX3N0YXRfMDEuMTcvaW50ZXJpbV9kYXRhIgoKIyBUaHJlc2hvbGRzIGZvciBnZW5vdHlwZXMKbWluLmdxIDwtIDIwCm1heC5kcCA8LSA1MDAKCiMgVmFyaWFudHMgY2FsbCByYXRldGhyZXNob2xkCm1pbi5jYWxsLnJhdGUgPC0gMC44CgpgYGAKCiMgbG9hZF9kYXRhCgpgYGB7ciBsb2FkX2RhdGF9Cgpsb2FkKHBhc3RlKGludGVyaW1fZGF0YV9mb2xkZXIsICJyMDFfcmVhZF9hbmRfY2xlYW5fZGF0YV9qYW4yMDE3LlJEYXRhIiwgc2VwPSIvIikpCgpgYGAKCiMgY2hlY2tfZGF0YQoKYGBge3IgY2hlY2tfZGF0YX0KCmRpbShndC5teCkKY2xhc3MoZ3QubXgpCmd0Lm14WzE6NSwxOjVdCgpkaW0oZ3EubXgpCmNsYXNzKGdxLm14KQpncS5teFsxOjUsMTo1XQoKZGltKGRwLm14KQpjbGFzcyhkcC5teCkKZHAubXhbMTo1LDE6NV0KCmRpbShjb3Zhci5kZikKc3RyKGNvdmFyLmRmKQpjb3Zhci5kZlsxOjUsMTo1XQoKZGltKHNhbXBsZXMuZGYpCnN0cihzYW1wbGVzLmRmKQpzYW1wbGVzLmRmWzE6NSxdCgpkaW0oZGVtb2dyYXBoaWNzLmRmKQpzdHIoZGVtb2dyYXBoaWNzLmRmKQpkZW1vZ3JhcGhpY3MuZGZbMTo1LDE6NV0KCmRpbShwaGVub3R5cGVzX3VwZGF0ZS5kZikKc3RyKHBoZW5vdHlwZXNfdXBkYXRlLmRmKQpwaGVub3R5cGVzX3VwZGF0ZS5kZlsxOjUsMTo1XQoKZGltKEJSQ0ExX0JSQ0EyX1BBTEIyX2Nhc2VzLmRmKQpzdHIoQlJDQTFfQlJDQTJfUEFMQjJfY2FzZXMuZGYpCkJSQ0ExX0JSQ0EyX1BBTEIyX2Nhc2VzLmRmWzE6NSwxOjVdCgpkaW0odnYuZGYpCnN0cih2di5kZikKdnYuZGZbMTo1LDE6NV0KCmRpbShrZ2VuLmRmKQpzdHIoa2dlbi5kZikKa2dlbi5kZlsxOjUsMTo1XQoKZGltKGV4YWMuZGYpCnN0cihleGFjLmRmKQpleGFjLmRmWzE6NSwxOjVdCgojIENoZWNrIGNvbnNpc3RlbmNlIG9mIHJvd25hbWVzCnN1bShyb3duYW1lcyhndC5teCkgIT0gcm93bmFtZXMoZ3EubXgpKQpzdW0ocm93bmFtZXMoZ3QubXgpICE9IHJvd25hbWVzKGRwLm14KSkKc3VtKHJvd25hbWVzKGd0Lm14KSAhPSByb3duYW1lcyh2di5kZikpCnN1bShyb3duYW1lcyhndC5teCkgIT0gcm93bmFtZXMoa2dlbi5kZikpCnN1bShyb3duYW1lcyhndC5teCkgIT0gcm93bmFtZXMoZXhhYy5kZikpCgojIENoZWNrIGNvbnNpc3RlbmNlIG9mIGNvbG5hbWVzCnN1bShjb2xuYW1lcyhndC5teCkgIT0gY29sbmFtZXMoZ3EubXgpKQpzdW0oY29sbmFtZXMoZ3QubXgpICE9IGNvbG5hbWVzKGRwLm14KSkKCmBgYAoKIyBleHBsb3JlX2RhdGFfYmVmb3JlX2ZpbHRlcmluZwoKR2Vub3R5cGVzIE5BIHJhdGVzCkhpc3RvZ3JhbSBvZiBjYWxsIHJhdGVzIHBlciB2YXJpYW50ICAKSGlzdG9ncmFtcyBvZiBncSBhbmQgZHAgaW4gbm9uLU5BIGdlbm90eXBlcyAKCmBgYHtyIGV4cGxvcmVfZGF0YV9iZWZvcmVfZmlsdGVyaW5nfQoKIyBGcmFjdGlvbiBvZiBOQSBnZW5vdHlwZXMgYmVmb3JlIGZpbHRlcmluZwpzdW0oaXMubmEoZ3QubXgpKS8oZGltKGd0Lm14KVsxXSpkaW0oZ3QubXgpWzJdKSAjIH40JQoKIyBDYWxsIHJhdGVzIHBlciB2YXJpYW50IGJlZm9yZSBmaWx0ZXJpbmcKeCA8LSBuY29sKGd0Lm14KQp5IDwtIGFwcGx5KGd0Lm14LDEsZnVuY3Rpb24oeil7MS1zdW0oaXMubmEoeikpL3h9KQpoaXN0KHksIGJyZWFrcz01MCwgeGxhYj1OVUxMLCBtYWluPSJDYWxsIHJhdGVzIHBlciB2YXJpYW50IGJlZm9yZSBnZW5vdHlwZXMgZmlsdGVyaW5nIikKCiMgSGlzdG9ncmFtIG9mIGdxICBiZWZvcmUgZmlsdGVyaW5nICh3aGVuIGd0IGlzIG5vdCBOQSAhKQpoaXN0KGdxLm14WyFpcy5uYShndC5teCldLCBicmVha3M9NTAsIG1haW49Ikhpc3RvZ3JhbSBvZiBncSBpbiBub24tTkEgZ2Vub3R5cGVzIChiZWZvcmUgZmlsdGVyaW5nKSIsIHhsYWI9TlVMTCkKCiMgSGlzdG9ncmFtIG9mIGRwIGJlZm9yZSBmaWx0ZXJpbmcgKHdoZW4gZ3QgaXMgbm90IE5BICEpCmhpc3QoZHAubXhbIWlzLm5hKGd0Lm14KV0sIGJyZWFrcz01MCwgbWFpbj0iSGlzdG9ncmFtIG9mIGRwIGluIG5vbi1OQSBnZW5vdHlwZXMgKGJlZm9yZSBmaWx0ZXJpbmcpIiwgeGxhYj1OVUxMKQpoaXN0KGRwLm14WyFpcy5uYShndC5teCldLCBicmVha3M9MjUwMCwgeGxpbT1jKDAsMTAwKSwgbWFpbj0iSGlzdG9ncmFtIG9mIGRwIGluIG5vbi1OQSBnZW5vdHlwZXMgKGJlZm9yZSBmaWx0ZXJpbmcsIDA6MTAwKSIsIHhsYWI9TlVMTCkKCnJtKHgseSkKCmBgYAoKIyBmaWx0ZXJfb3V0X2xvd19ncQoKUHV0IE5BIHRvIGdlbm90eXBlcyB3aGVyZSBncSA8IDIwIDogcmVtb3ZlcyB+OCUgb2Ygbm9uLU5BIGdlbm90eXBlcwoKYGBge3IgZmlsdGVyX291dF9sb3dfZ3F9CgojIG51bSBvZiBnZW5vdHlwZXMgdG8gYmUgcmVtb3ZlZApzdW0oZ3EubXggPCBtaW4uZ3EsIG5hLnJtPVRSVUUpCgojIEZyYWN0aW9uIG9mIGdlbm90eXBlcyB0byBiZSByZW1vdmVkCnN1bShncS5teCA8IG1pbi5ncSwgbmEucm09VFJVRSkvc3VtKCFpcy5uYShncS5teCkpICMgfjglCgojIEFwcGx5IGZpbHRlciAodG8gZ3Qgb25seSAhKQpOQSAtPiBndC5teFsgZ3EubXggPCBtaW4uZ3EgXQoKIyBDbGVhbiB1cApybShtaW4uZ3EpCgpgYGAKCiMgZXhwbG9yZV9kYXRhX2FmdGVyX2dxX2ZpbHRlcmluZwoKZGltKCkKZ2Vub3R5cGVzIE5BIHJhdGVzCkhpc3RvZ3JhbSBvZiBjYWxsIHJhdGVzIHBlciB2YXJpYW50Ckhpc3RvZ3JhbXMgb2YgZ3EgYW5kIGRwIGluIG5vbi1OQSBnZW5vdHlwZXMKCmBgYHtyIGV4cGxvcmVfZGF0YV9hZnRlcl9ncV9maWx0ZXJpbmd9CgpkaW0oZ3QubXgpCgojIEZyYWN0aW9uIG9mIE5BIGdlbm90eXBlcyBhZnRlciBmaWx0ZXJpbmcKc3VtKGlzLm5hKGd0Lm14KSkvKGRpbShndC5teClbMV0qZGltKGd0Lm14KVsyXSkgIyB+MTIlCgojIENhbGwgcmF0ZXMgcGVyIHZhcmlhbnQgYWZ0ZXIgZ3EgZmlsdGVyaW5nCnggPC0gbmNvbChndC5teCkKeSA8LSBhcHBseShndC5teCwgMSwgZnVuY3Rpb24oeil7MS1zdW0oaXMubmEoeikpL3h9KQpoaXN0KHksIHhsYWI9TlVMTCwgbWFpbj0iSGlzdG9ncmFtIG9mIGNhbGwgcmF0ZXMgcGVyIHZhcmlhbnQgYWZ0ZXIgZ3QgZmlsdGVyaW5nIikKCiMgSGlzdG9ncmFtIG9mIGdxICBhZnRlciBncSBmaWx0ZXJpbmcgKHdoZW4gZ3QgaXMgbm90IE5BICEpCmhpc3QoZ3EubXhbIWlzLm5hKGd0Lm14KV0sIHhsaW09YygwLDEwMCksIGJyZWFrcz01MCwgbWFpbj0iSGlzdG9ncmFtIG9mIGdxIGluIG5vbiBOQSBnZW5vdHlwZXMgKGFmdGVyIGdxIGZpbHRlcmluZykiLCB4bGFiPU5VTEwpCgojIEhpc3RvZ3JhbSBvZiBkcCBhZnRlciBndCBmaWx0ZXJpbmcgKHdoZW4gZ3QgaXMgbm90IE5BICEpCmhpc3QoZHAubXhbIWlzLm5hKGd0Lm14KV0sIGJyZWFrcz01MCwgbWFpbj0iSGlzdG9ncmFtIG9mIGRwIGluIG5vbi1OQSBnZW5vdHlwZXMgKGFmdGVyIGd0IGZpbHRlcmluZykiLCB4bGFiPU5VTEwpCmhpc3QoZHAubXhbIWlzLm5hKGd0Lm14KV0sIHhsaW09YygwLDEwMCksIGJyZWFrcz0yNTAwLCBtYWluPSJIaXN0b2dyYW0gb2YgZHAgaW4gbm9uLU5BIGdlbm90eXBlcyAoYWZ0ZXIgZ3QgZmlsdGVyaW5nLCAwOjEwMCkiLCB4bGFiPU5VTEwpCgojIENsZWFuIHVwCnJtKHgsIHkpCgpgYGAKCiMgZmlsdGVyX291dF9oaWdoX2RwCgpwdXQgTkEgdG8gZ2Vub3R5cGVzIHdoZXJlIGRwID4gNTAwIDogcmVtb3ZlcyA8PDElIG9mIG5vbi1OQSBnZW5vdHlwZXMKCmBgYHtyIGZpbHRlcl9vdXRfaGlnaF9kcH0KCiMgbnVtIG9mIGdlbm90eXBlcyB0byBiZSByZW1vdmVkCnN1bShkcC5teCA+IG1heC5kcCwgbmEucm09VFJVRSkKCiMgRnJhY3Rpb24gb2YgZ2Vub3R5cGVzIHRvIGJlIHJlbW92ZWQgKGFwcHIpCnN1bShkcC5teCA+IG1heC5kcCwgbmEucm09VFJVRSkvc3VtKCFpcy5uYShncS5teCkpICMgPDwxJQoKIyBBcHBseSBmaWx0ZXIgKHRvIGd0IG9ubHkgISkKTkEgLT4gZ3QubXhbIGRwLm14ID4gbWF4LmRwIF0KCiMgQ2xlYW4gdXAKcm0obWF4LmRwKQoKYGBgCgojIGV4cGxvcmVfZGF0YV9hZnRlcl9ncV9kcF9maWx0ZXJpbmcKCmRpbSgpCkdlbm90eXBlcyBOQSByYXRlcyAKSGlzdG9ncmFtIG9mIGNhbGwgcmF0ZXMgcGVyIHZhcmlhbnQKSGlzdG9ncmFtcyBvZiBncSBhbmQgZHAgIAoKYGBge3IgZXhwbG9yZV9kYXRhX2FmdGVyX2dxX2RwX2ZpbHRlcmluZ30KCmRpbShndC5teCkKCiMgRnJhY3Rpb24gb2YgTkEgZ2Vub3R5cGVzIGFmdGVyIGZpbHRlcmluZwpzdW0oaXMubmEoZ3QubXgpKS8oZGltKGd0Lm14KVsxXSpkaW0oZ3QubXgpWzJdKSAjIH4xMiUKCiMgQ2FsbCByYXRlcyBwZXIgdmFyaWFudCBhZnRlciBncSBmaWx0ZXJpbmcKeCA8LSBuY29sKGd0Lm14KQp5IDwtIGFwcGx5KGd0Lm14LCAxLCBmdW5jdGlvbih6KXsxLXN1bShpcy5uYSh6KSkveH0pCmhpc3QoeSwgeGxhYj1OVUxMLCBtYWluPSJDYWxsIHJhdGVzIHBlciB2YXJpYW50IGFmdGVyIGd0K2RwIGZpbHRlcmluZyIpCgojIEhpc3RvZ3JhbSBvZiBncSAgYWZ0ZXIgZ3ErZHAgZmlsdGVyaW5nICh3aGVuIGd0IGlzIG5vdCBOQSAhKQpoaXN0KGdxLm14WyFpcy5uYShndC5teCldLCB4bGltPWMoMCwxMDApLCBicmVha3M9NTAsIG1haW49Ikhpc3RvZ3JhbSBvZiBncSBpbiBub24tTkEgZ2Vub3R5cGVzIChhZnRlciBncStkcCBmaWx0ZXJpbmcpIiwgeGxhYj1OVUxMKQoKIyBIaXN0b2dyYW0gb2YgZHAgYWZ0ZXIgZ3QgZmlsdGVyaW5nICh3aGVuIGd0IGlzIG5vdCBOQSAhKQpoaXN0KGRwLm14WyFpcy5uYShndC5teCldLCBicmVha3M9NTAsIG1haW49Ikhpc3RvZ3JhbSBvZiBkcCBhZnRlciBndCtkcCBmaWx0ZXJpbmciLCB4bGFiPU5VTEwpCmhpc3QoZHAubXhbIWlzLm5hKGd0Lm14KV0sIHhsaW09YygwLDEwMCksIGJyZWFrcz01MDAsIG1haW49Ikhpc3RvZ3JhbSBvZiBkcCBpbiBub24tTkEgZ2Vub3R5cGVzIChhZnRlciBndCtkcCBmaWx0ZXJpbmcsIDA6MTAwKSIsIHhsYWI9TlVMTCkKCiMgQ2xlYW4gdXAKcm0oeCwgeSkKCmBgYAoKIyBmaWx0ZXJfdmFyaWFudHNfYnlfZmluYWxfY2FsbF9yYXRlCgpSZW1vdmUgdmFyaWFudHMgd2l0aCBjYWxsIHJhdGUgPCA4MCUgYWZ0ZXIgdGhlIGdlbW5vdHlwZXMgZmlsdGVyaW5nOiByZW1vdmVzIH4xOCUgb2YgdmFyaWFudHMgKDM0Myw4MjQgLT4gMjgzLDY1MSkKCmBgYHtyIGZpbHRlcl92YXJpYW50c19ieV9maW5hbF9jYWxsX3JhdGV9CgojIEVzdGltYXRlIHRoZSBwcm9wb3J0aW9uIG9mIHZhcmlhbnRzIHRvIGJlIHJlbW92ZWQKeCA8LSBuY29sKGd0Lm14KQp5IDwtIGFwcGx5KGd0Lm14LCAxLCBmdW5jdGlvbih6KXsxLXN1bShpcy5uYSh6KSkveH0pCnlbMTo3XQp2YXIucmV0YWluZWQgPC0geSA+PSBtaW4uY2FsbC5yYXRlCnN1bSh2YXIucmV0YWluZWQpICMgMjgzLjY1MQoxIC0gc3VtKHZhci5yZXRhaW5lZCkvbnJvdyhndC5teCkgIyB+MTglCgojIFJlbW92ZSB2YXJpYW50cyB3aXRoIGxvYXcgY2FsbCByYXRlcwpndC5teCA8LSBndC5teFsgdmFyLnJldGFpbmVkLCBdCmRwLm14IDwtIGRwLm14WyB2YXIucmV0YWluZWQsIF0KZ3EubXggPC0gZ3EubXhbIHZhci5yZXRhaW5lZCwgXQp2di5kZiA8LSB2di5kZlsgdmFyLnJldGFpbmVkLCBdCmtnZW4uZGYgPC0ga2dlbi5kZlsgdmFyLnJldGFpbmVkLCBdCmV4YWMuZGYgPC0gZXhhYy5kZlsgdmFyLnJldGFpbmVkLCBdCgojIENsZWFuLXVwCnJtKG1pbi5jYWxsLnJhdGUsIHZhci5yZXRhaW5lZCwgeCwgeSkKCmBgYAoKIyBleHBsb3JlX2RhdGFfYWZ0ZXJfZ3FfZHBfY3JfZmlsdGVyaW5nCgpIaXN0b2dyYW1zIG9mIGdxIGFuZCBkcCAgCk5BIHJhdGVzIGluIGdxIHRhYmxlICAKSGlzdG9ncmFtIG9mIGNhbGwgcmF0ZXMgcGVyIHZhcmlhbnQKCmBgYHtyIGV4cGxvcmVfZGF0YV9hZnRlcl9maWx0ZXJpbmd9CgpkaW0oZ3QubXgpCgojIEZyYWN0aW9uIG9mIE5BIGdlbm90eXBlcyBhZnRlciBmaWx0ZXJpbmcKc3VtKGlzLm5hKGd0Lm14KSkvKGRpbShndC5teClbMV0qZGltKGd0Lm14KVsyXSkgIyB+OCUKCiMgQ2FsbCByYXRlcyBwZXIgdmFyaWFudCBhZnRlciBmaWx0ZXJpbmcKeCA8LSBuY29sKGd0Lm14KQp5IDwtIGFwcGx5KGd0Lm14LDEsZnVuY3Rpb24oeil7MS1zdW0oaXMubmEoeikpL3h9KQpoaXN0KHksIHhsaW09YygwLDEpLCBicmVha3M9MTAsIHhsYWI9TlVMTCwgbWFpbj0iQ2FsbCByYXRlcyBwZXIgdmFyaWFudCBhZnRlciBncStkcCtjciBnZW5vdHlwZXMgZmlsdGVyaW5nIikKCiMgSGlzdG9ncmFtIG9mIGdxICBhZnRlciBmaWx0ZXJpbmcgKHdoZW4gZ3QgaXMgbm90IE5BICEpCmhpc3QoZ3EubXhbIWlzLm5hKGd0Lm14KV0sIHhsaW09YygwLDEwMCksIGJyZWFrcz01MCwgbWFpbj0iSGlzdG9ncmFtIG9mIGdxIGluIG5vbi1OQSBnZW5vdHlwZXMgKGFmdGVyIGdxK2RwK2NyIGZpbHRlcmluZykiLCB4bGFiPU5VTEwpCgojIEhpc3RvZ3JhbSBvZiBkcCAgYWZ0ZXIgZmlsdGVyaW5nICh3aGVuIGd0IGlzIG5vdCBOQSAhKQpoaXN0KGRwLm14WyFpcy5uYShndC5teCldLCBicmVha3M9NTAsIG1haW49Ikhpc3RvZ3JhbSBvZiBkcCBpbiBub24tTkEgZ2Vub3R5cGVzIChhZnRlciBncStkcCtjciBmaWx0ZXJpbmcpIiwgeGxhYj1OVUxMKQpoaXN0KGRwLm14WyFpcy5uYShndC5teCldLCBicmVha3M9NTAwLCB4bGltPWMoMCwxMDApLCBtYWluPSJIaXN0b2dyYW0gb2YgZHAgaW4gbm9uLU5BIGdlbm90eXBlcyAoYWZ0ZXIgZ3ErZHArY3IgZmlsdGVyaW5nLCAwOjEwMCkiLCB4bGFiPU5VTEwpCgojIENsZWFuLXVwCnJtKHgsIHkpCgpgYGAKCgojIHJlbW92ZV91bm5lY2Vzc2FyeV9kYXRhCgpgYGB7ciByZW1vdmVfdW5uZWNlc3NhcnlfZGF0YX0KCm1lYW4oZ3EubXgsIG5hLnJtPVRSVUUpCm1lYW4oZHAubXgsIG5hLnJtPVRSVUUpCnJtKGdxLm14LCBkcC5teCkKCmBgYAoKCiMgZGF0YV9zdW1tYXJ5CgpgYGB7ciBkYXRhX3N1bW1hcnl9CgpkaW0oZ3QubXgpCmNsYXNzKGd0Lm14KQpndC5teFsxOjUsMTo1XQoKZGltKGNvdmFyLmRmKQpzdHIoY292YXIuZGYpCmNvdmFyLmRmWzE6NSwxOjVdCgpkaW0oc2FtcGxlcy5kZikKc3RyKHNhbXBsZXMuZGYpCnNhbXBsZXMuZGZbMTo1LF0KCmRpbShkZW1vZ3JhcGhpY3MuZGYpCnN0cihkZW1vZ3JhcGhpY3MuZGYpCmRlbW9ncmFwaGljcy5kZlsxOjUsMTo1XQoKZGltKHBoZW5vdHlwZXNfdXBkYXRlLmRmKQpzdHIocGhlbm90eXBlc191cGRhdGUuZGYpCnBoZW5vdHlwZXNfdXBkYXRlLmRmWzE6NSwxOjVdCgpkaW0oQlJDQTFfQlJDQTJfUEFMQjJfY2FzZXMuZGYpCnN0cihCUkNBMV9CUkNBMl9QQUxCMl9jYXNlcy5kZikKQlJDQTFfQlJDQTJfUEFMQjJfY2FzZXMuZGZbMTo1LDE6NV0KCmRpbSh2di5kZikKc3RyKHZ2LmRmKQp2di5kZlsxOjUsMTo1XQoKZGltKGtnZW4uZGYpCnN0cihrZ2VuLmRmKQprZ2VuLmRmWzE6NSwxOjVdCgpkaW0oZXhhYy5kZikKc3RyKGV4YWMuZGYpCmV4YWMuZGZbMTo1LDE6NV0KCiMgQ2hlY2sgY29uc2lzdGVuY2Ugb2Ygcm93bmFtZXMKc3VtKHJvd25hbWVzKGd0Lm14KSAhPSByb3duYW1lcyhncS5teCkpCnN1bShyb3duYW1lcyhndC5teCkgIT0gcm93bmFtZXMoZHAubXgpKQpzdW0ocm93bmFtZXMoZ3QubXgpICE9IHJvd25hbWVzKHZ2LmRmKSkKc3VtKHJvd25hbWVzKGd0Lm14KSAhPSByb3duYW1lcyhrZ2VuLmRmKSkKc3VtKHJvd25hbWVzKGd0Lm14KSAhPSByb3duYW1lcyhleGFjLmRmKSkKCiMgQ2hlY2sgY29uc2lzdGVuY2Ugb2YgY29sbmFtZXMKc3VtKGNvbG5hbWVzKGd0Lm14KSAhPSBjb2xuYW1lcyhncS5teCkpCnN1bShjb2xuYW1lcyhndC5teCkgIT0gY29sbmFtZXMoZHAubXgpKQoKYGBgCgojIHNhdmVfZGF0YQoKYGBge3Igc2F2ZV9kYXRhfQoKc2F2ZS5pbWFnZShwYXN0ZShpbnRlcmltX2RhdGFfZm9sZGVyLCAiczAyX2ZpbHRlcl9nZW5vdHlwZXNfYW5kX3ZhcmlhbnRzX2phbjIwMTcuUkRhdGEiLCBzZXA9Ii8iKSkKCmBgYAoKIyBmaW5hbF9zZWN0aW9uCgpgYGB7ciBmaW5hbF9zZWN0aW9ufQoKbHMoKQpzZXNzaW9uSW5mbygpClN5cy50aW1lKCkKCmBgYAoK